import pandas as pd
import numpy as np
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from plotly.offline import iplot
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
import ipywidgets as widgets
from keras.models import load_model
from ipywidgets import *
print("Libraries loaded!")
Libraries loaded!
# !jupyter nbextension enable --py widgetsnbextension --sys-prefix
# !jupyter serverextension enable voila --sys-prefix
print()
print("Loading data.....")
# Read the raw transaction file.  ISO-8859-1 handles the non-UTF8 characters
# in product descriptions.  FIX: the original dtype key 'InvoiceID' matched no
# column (the file has 'InvoiceNo', see df.columns below) and was silently
# ignored by pandas; read InvoiceNo as str so the 'C' cancellation prefix and
# leading zeros are preserved.
customer_data = pd.read_csv('E-Commerce_Data.csv', encoding='ISO-8859-1', dtype={'InvoiceNo': str})
customer_data['InvoiceDate'] = pd.to_datetime(customer_data['InvoiceDate'])  # convert to python datetime object
print("Data loaded!")
Loading data..... Data loaded!
print()  # blank line before the data overview section
def check_data(dataframe):
    """Print a quick structural overview of *dataframe*.

    Shows, in order: shape, columns, dtypes, head, tail and the transposed
    numeric summary, each under a banner line, matching the original
    section-by-section layout (two blank lines between sections, one after
    the last).
    """
    sections = [
        (" *********************************SHAPE******************************",
         lambda df: df.shape),
        ("*********************************COLUMNS******************************",
         lambda df: df.columns),
        ("**********************************TYPES*******************************",
         lambda df: df.dtypes),
        ("**********************************HEAD*******************************",
         lambda df: df.head()),
        ("**********************************TAIL*******************************",
         lambda df: df.tail()),
        ("*******************************DESCRIPTION***************************",
         lambda df: df.describe().T),
    ]
    last = len(sections) - 1
    for pos, (banner, extract) in enumerate(sections):
        print(banner)
        print(extract(dataframe))
        print()
        if pos < last:
            print()
check_data(customer_data)  # print the structural overview of the raw data
*********************************SHAPE******************************
(541909, 8)
*********************************COLUMNS******************************
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
'UnitPrice', 'CustomerID', 'Country'],
dtype='object')
**********************************TYPES*******************************
InvoiceNo object
StockCode object
Description object
Quantity int64
InvoiceDate datetime64[ns]
UnitPrice float64
CustomerID float64
Country object
dtype: object
**********************************HEAD*******************************
InvoiceNo StockCode Description Quantity \
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
1 536365 71053 WHITE METAL LANTERN 6
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
InvoiceDate UnitPrice CustomerID Country
0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom
1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom
3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
**********************************TAIL*******************************
InvoiceNo StockCode Description Quantity \
541904 581587 22613 PACK OF 20 SPACEBOY NAPKINS 12
541905 581587 22899 CHILDREN'S APRON DOLLY GIRL 6
541906 581587 23254 CHILDRENS CUTLERY DOLLY GIRL 4
541907 581587 23255 CHILDRENS CUTLERY CIRCUS PARADE 4
541908 581587 22138 BAKING SET 9 PIECE RETROSPOT 3
InvoiceDate UnitPrice CustomerID Country
541904 2011-12-09 12:50:00 0.85 12680.0 France
541905 2011-12-09 12:50:00 2.10 12680.0 France
541906 2011-12-09 12:50:00 4.15 12680.0 France
541907 2011-12-09 12:50:00 4.15 12680.0 France
541908 2011-12-09 12:50:00 4.95 12680.0 France
*******************************DESCRIPTION***************************
count mean std min 25% 50% \
Quantity 541909.0 9.552250 218.081158 -80995.00 1.00 3.00
UnitPrice 541909.0 4.611114 96.759853 -11062.06 1.25 2.08
CustomerID 406829.0 15287.690570 1713.600303 12346.00 13953.00 15152.00
75% max
Quantity 10.00 80995.0
UnitPrice 4.13 38970.0
CustomerID 16791.00 18287.0
#Get the total number of missing values for each attribute
print()
print(customer_data.isnull().sum())  # only Description and CustomerID contain NaNs (see output)
print()
InvoiceNo 0 StockCode 0 Description 1454 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 135080 Country 0 dtype: int64
#Check for repeated rows
print()
print("Duplicated values:", customer_data.duplicated().sum())  # fully identical rows; removed further below
print()
Duplicated values: 5268
#Get the unique number of countries within the data set
print()
# Group to one row per (customer, invoice, country) so each country is
# counted once per order rather than once per order line.
temp = customer_data.groupby(['CustomerID', 'InvoiceNo', 'Country']).count()
#temp = customer_data.groupby(['Country']).count()
temp = temp.reset_index(drop = False)
countries = temp['Country'].value_counts()
print('Number of countries in the dataframe: {}' .format(len(countries)))
Number of countries in the dataframe: 37
#Visualize the total number of orders for different countries
# Choropleth map: one region per country, colored by its order count.
# NOTE(review): the very uneven colorscale stops presumably compensate for one
# country (the UK) dominating the counts -- confirm against the data.
data = dict(type='choropleth',
            locations = countries.index,
            locationmode = 'country names', z = countries,
            text = countries.index, colorbar = {'title':'Order nb.'},
            colorscale=[[0, 'rgb(224,255,255)'],
                        [0.01, 'rgb(166,206,227)'], [0.02, 'rgb(31,120,180)'],
                        [0.03, 'rgb(178,223,138)'], [0.05, 'rgb(51,160,44)'],
                        [0.10, 'rgb(251,154,153)'], [0.20, 'rgb(255,255,0)'],
                        [1, 'rgb(227,26,28)']],
            reversescale = False)
#_______________________
layout = dict(title='Number of orders per country',
              geo = dict(showframe = True, projection={'type':'mercator'}))
#______________
choromap = go.Figure(data = [data], layout = layout)
iplot(choromap, validate=False)  # renders inline in a notebook only
print()
#Total number of customers, products and transactions
# Single-row summary frame; nunique() drops NaN, exactly like
# len(value_counts()) did in the original.
counts = {label: customer_data[column].nunique()
          for label, column in [('products', 'StockCode'),
                                ('transactions', 'InvoiceNo'),
                                ('customers', 'CustomerID')]}
pd.DataFrame(counts, columns = ['products', 'transactions', 'customers'], index = ['quantity'])
| products | transactions | customers | |
|---|---|---|---|
| quantity | 4070 | 25900 | 4372 |
#Number of products purchased in every transaction
print()
# Count order lines per (customer, invoice); InvoiceDate is just a convenient
# always-present column to count on.
temp = customer_data.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate'].count()
no_products_per_basket = temp.rename(columns = {'InvoiceDate':'Number of products'})
no_products_per_basket[:10].sort_values('CustomerID')
| CustomerID | InvoiceNo | Number of products | |
|---|---|---|---|
| 0 | 12346.0 | 541431 | 1 |
| 1 | 12346.0 | C541433 | 1 |
| 2 | 12347.0 | 537626 | 31 |
| 3 | 12347.0 | 542237 | 29 |
| 4 | 12347.0 | 549222 | 24 |
| 5 | 12347.0 | 556201 | 18 |
| 6 | 12347.0 | 562032 | 22 |
| 7 | 12347.0 | 573511 | 47 |
| 8 | 12347.0 | 581180 | 11 |
| 9 | 12348.0 | 539318 | 17 |
# Stock codes beginning with a letter denote special charges (postage,
# discounts, bank charges, ...) rather than regular products.
particular_code_list = customer_data[customer_data['StockCode'].str.contains(
    '^[a-zA-Z]+', regex=True)]['StockCode'].unique()
particular_code_list
array(['POST', 'D', 'C2', 'DOT', 'M', 'BANK CHARGES', 'S', 'AMAZONFEE',
'DCGS0076', 'DCGS0003', 'gift_0001_40', 'DCGS0070', 'm',
'gift_0001_50', 'gift_0001_30', 'gift_0001_20', 'DCGS0055',
'DCGS0072', 'DCGS0074', 'DCGS0069', 'DCGS0057', 'DCGSSBOY',
'DCGSSGIRL', 'gift_0001_10', 'PADS', 'DCGS0004', 'DCGS0073',
'DCGS0071', 'DCGS0068', 'DCGS0067', 'DCGS0066P', 'B', 'CRUK'],
dtype=object)
# Inspect the rows behind the special stock code 'B' ("Adjust bad debt" entries)
particular_code_description = customer_data[customer_data['StockCode'] == 'B']
particular_code_description
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 299982 | A563185 | B | Adjust bad debt | 1 | 2011-08-12 14:50:00 | 11062.06 | NaN | United Kingdom |
| 299983 | A563186 | B | Adjust bad debt | 1 | 2011-08-12 14:51:00 | -11062.06 | NaN | United Kingdom |
| 299984 | A563187 | B | Adjust bad debt | 1 | 2011-08-12 14:52:00 | -11062.06 | NaN | United Kingdom |
#Fill the missing values
# FIX: calling fillna(inplace=True) on a column slice is chained assignment,
# which is unreliable and deprecated under pandas copy-on-write; assign the
# result back to the column instead (same values, safe in pandas 2.x).
# NOTE(review): imputing CustomerID with the column mean creates one
# artificial "average customer" shared by ~135k rows -- confirm this is
# intended rather than dropping those rows.
customer_data['CustomerID'] = customer_data['CustomerID'].fillna(customer_data['CustomerID'].mean())
customer_data['Description'] = customer_data['Description'].fillna('')
print("Done!")
Done!
#Check if there are still missing values
customer_data.isnull().sum()  # expect all zeros after the imputation above
InvoiceNo 0 StockCode 0 Description 0 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 0 Country 0 dtype: int64
#remove duplicated rows
customer_data.drop_duplicates(inplace=True)  # keeps the first occurrence of each duplicate group
print("Done!")
Done!
#Check if there are still duplicates
print("Repeated entries:",customer_data.duplicated().sum())  # expect 0 after drop_duplicates
Repeated entries: 0
print("Length of data set now", len(customer_data))
Length of data set now 536641
#Gather all orders that might indicate a cancelled order
# Cancellations carry a negative Quantity (their InvoiceNo is 'C'-prefixed,
# as the sample output below shows)
cancelledOrders=customer_data[customer_data["Quantity"]<0]
cancelledOrders.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 141 | C536379 | D | Discount | -1 | 2010-12-01 09:41:00 | 27.50 | 14527.0 | United Kingdom |
| 154 | C536383 | 35004C | SET OF 3 COLOURED FLYING DUCKS | -1 | 2010-12-01 09:49:00 | 4.65 | 15311.0 | United Kingdom |
| 235 | C536391 | 22556 | PLASTERS IN TIN CIRCUS PARADE | -12 | 2010-12-01 10:24:00 | 1.65 | 17548.0 | United Kingdom |
| 236 | C536391 | 21984 | PACK OF 12 PINK PAISLEY TISSUES | -24 | 2010-12-01 10:24:00 | 0.29 | 17548.0 | United Kingdom |
| 237 | C536391 | 21983 | PACK OF 12 BLUE PAISLEY TISSUES | -24 | 2010-12-01 10:24:00 | 0.29 | 17548.0 | United Kingdom |
#Check if negative quantity correspond to cancelled transaction
# display() is the IPython rich-display helper; available in notebooks only
display(customer_data.sort_values('CustomerID')[:5])
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 61619 | 541431 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | 74215 | 2011-01-18 10:01:00 | 1.04 | 12346.0 | United Kingdom |
| 61624 | C541433 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | -74215 | 2011-01-18 10:17:00 | 1.04 | 12346.0 | United Kingdom |
| 428981 | 573511 | 22992 | REVOLVER WOODEN RULER | 12 | 2011-10-31 12:25:00 | 1.95 | 12347.0 | Iceland |
| 429001 | 573511 | 20719 | WOODLAND CHARLOTTE BAG | 10 | 2011-10-31 12:25:00 | 0.85 | 12347.0 | Iceland |
| 429002 | 573511 | 23162 | REGENCY TEA STRAINER | 8 | 2011-10-31 12:25:00 | 3.75 | 12347.0 | Iceland |
#Inspect if all orders with negative quantity indicate cancelled orders
# For each cancellation, look for an earlier matching purchase: same customer,
# same description, opposite quantity.  FIX: the original used positional
# access (col[0], col[1], col[2]) where col[2] is StockCode, yet compared it
# against the Description column -- the counterpart lookup tested the wrong
# field.  Label-based access makes the intent explicit and correct.
data_check = customer_data[customer_data['Quantity'] < 0][['CustomerID', 'Quantity', 'StockCode', 'Description', 'UnitPrice']]
for index, col in data_check.iterrows():
    if customer_data[(customer_data['CustomerID'] == col['CustomerID'])
                     & (customer_data['Quantity'] == -col['Quantity'])
                     & (customer_data['Description'] == col['Description'])].shape[0] == 0:
        print(data_check.loc[index])
        print('Hypothesis NOT fulfilled')
        break
CustomerID 14527.0 Quantity -1 StockCode D Description Discount UnitPrice 27.5 Name: 141, dtype: object Hypothesis NOT fulfilled
#Perform the previous check but ignore discount.
# Same counterpart search as above with Discount rows excluded.  FIX: replaced
# positional col[0]/col[1]/col[2] with label access -- col[2] was StockCode
# but was compared against Description in the original.
data_check = customer_data[(customer_data['Quantity'] < 0)
                           & (customer_data['Description'] != 'Discount')][['CustomerID', 'Quantity', 'StockCode', 'Description', 'UnitPrice']]
for index, col in data_check.iterrows():
    if customer_data[(customer_data['CustomerID'] == col['CustomerID'])
                     & (customer_data['Quantity'] == -col['Quantity'])
                     & (customer_data['Description'] == col['Description'])].shape[0] == 0:
        print(data_check.loc[index])
        print('HYPOTHESIS not fulfilled')
        break
CustomerID 15311.0 Quantity -1 StockCode 35004C Description SET OF 3 COLOURED FLYING DUCKS UnitPrice 4.65 Name: 154, dtype: object HYPOTHESIS not fulfilled
#Gather all entries that relate to cancelled orders and store the quantity cancelled for each cancelled order
# Strategy: for every negative-quantity row (a cancellation; discounts are
# skipped) search for earlier positive purchases by the same customer of the
# same StockCode, then:
#   - no earlier purchase       -> doubtful entry (dropped later),
#   - exactly one counterpart   -> record the cancelled quantity on it,
#   - several counterparts      -> credit the most recent one whose quantity
#                                  is large enough to absorb the cancellation.
data_cleaned = customer_data.copy(deep =True)
data_cleaned['QuantityCanceled'] = 0   # per-row units later cancelled
entry_to_remove = []   # cancellation rows with a counterpart
doubtful_entry = []    # cancellation rows without any counterpart
for index, col in customer_data.iterrows():
    if (col['Quantity']> 0) or col['Description'] == 'Discount':
        continue
    data_test = customer_data[(customer_data['CustomerID'] == col['CustomerID']) & (customer_data['StockCode'] == col['StockCode'])
                              & (customer_data['InvoiceDate'] < col['InvoiceDate']) & (customer_data['Quantity'] > 0)].copy()
    #**********************************************
    #Cancellation without counterpart
    if (data_test.shape[0] == 0):
        doubtful_entry.append(index)
    #Cancelation with a counterpart
    elif (data_test.shape[0] == 1):
        counterpart_index = data_test.index[0]
        data_cleaned.loc[counterpart_index, 'QuantityCanceled'] = -col['Quantity']
        entry_to_remove.append(index)
    #Entries with several counterparts. We delete the last one
    elif (data_test.shape[0]>1):
        # iterate newest-first and credit the first purchase big enough
        data_test.sort_index(axis=0, ascending=False, inplace=True)
        for ind, val in data_test.iterrows():
            if val['Quantity'] < -col['Quantity']: continue
            data_cleaned.loc[ind, 'QuantityCanceled'] = -col['Quantity']
            entry_to_remove.append(index)
            break
print("Length of cancelled orders without a counterpart and ones with one or more counterpart:", len(doubtful_entry + entry_to_remove))
Length of cancelled orders without a counterpart and ones with one or more counterpart: 9771
#Remove entries that do not have a counterpart and ones that have atleast one counterpart
# The cancelled units themselves now live in data_cleaned['QuantityCanceled']
data_cleaned.drop(entry_to_remove, axis=0, inplace=True)
data_cleaned.drop(doubtful_entry, axis=0, inplace=True)
print("Done!")
Done!
#Check for entries that have negative quantity
# StockCode 'D' (discount) rows legitimately keep a negative quantity
remaining_entries = data_cleaned[(data_cleaned['Quantity']<0) & (data_cleaned['StockCode']!='D')]
print("Remaining entries to delete: {}".format(remaining_entries.shape[0]))
remaining_entries[:5]
Remaining entries to delete: 739
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | QuantityCanceled | |
|---|---|---|---|---|---|---|---|---|---|
| 7188 | 536996 | 22712 | -20 | 2010-12-03 15:30:00 | 0.0 | 15287.69057 | United Kingdom | 0 | |
| 7201 | 537009 | 84534B | -80 | 2010-12-03 15:38:00 | 0.0 | 15287.69057 | United Kingdom | 0 | |
| 7202 | 537010 | 22162 | -40 | 2010-12-03 15:38:00 | 0.0 | 15287.69057 | United Kingdom | 0 | |
| 7205 | 537013 | 35965 | -25 | 2010-12-03 15:40:00 | 0.0 | 15287.69057 | United Kingdom | 0 | |
| 7291 | 537027 | 18098C | -140 | 2010-12-03 16:36:00 | 0.0 | 15287.69057 | United Kingdom | 0 |
#Remove remaining suspicious entries (negative quantity, no counterpart, not a discount)
data_cleaned.drop(remaining_entries.index,axis=0,inplace=True)
print("Done!")
Done!
print("Number of entries to delete: {}".format(data_cleaned[(data_cleaned['Quantity']<0) & (data_cleaned['StockCode']!='D')].shape[0]))
remaining_entries[:5]
print("Length of data frame now: {}", len(data_cleaned))
Number of entries to delete: 0
Length of data frame now: {} 526131
#Compute total amount for each entry
# Net out the units cancelled later before pricing the order line
data_cleaned['TotalPrice'] = data_cleaned['UnitPrice'] * (data_cleaned['Quantity'] - data_cleaned['QuantityCanceled'])
data_cleaned['TotalQuantity'] = data_cleaned['Quantity'] - data_cleaned['QuantityCanceled']
data_cleaned.sort_values('CustomerID')[:5]
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | QuantityCanceled | TotalPrice | TotalQuantity | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 61619 | 541431 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | 74215 | 2011-01-18 10:01:00 | 1.04 | 12346.0 | United Kingdom | 74215 | 0.00 | 0 |
| 535014 | 581180 | 23508 | MINI PLAYING CARDS DOLLY GIRL | 20 | 2011-12-07 15:52:00 | 0.42 | 12347.0 | Iceland | 0 | 8.40 | 20 |
| 535011 | 581180 | 21265 | PINK GOOSE FEATHER TREE 60CM | 12 | 2011-12-07 15:52:00 | 1.95 | 12347.0 | Iceland | 0 | 23.40 | 12 |
| 14968 | 537626 | 20782 | CAMOUFLAGE EAR MUFF HEADPHONES | 6 | 2010-12-07 14:57:00 | 5.49 | 12347.0 | Iceland | 0 | 32.94 | 6 |
| 286621 | 562032 | 23308 | SET OF 60 VINTAGE LEAF CAKE CASES | 24 | 2011-08-02 08:48:00 | 0.55 | 12347.0 | Iceland | 0 | 13.20 | 24 |
#Sum of purchases/ user & order
# FIX: select the aggregated columns with a list of labels -- the tuple form
# groupby(...)['TotalPrice', 'TotalQuantity'] is deprecated (it raised the
# FutureWarning shown below).
order_total_df = data_cleaned.groupby(by= ['CustomerID', 'InvoiceNo', 'InvoiceDate'], as_index=False)[['TotalPrice', 'TotalQuantity']].sum()
basket_price = order_total_df.rename(columns= {'TotalPrice' :'Basket Price'})
#Selection of important entries (keep only baskets with a positive net price)
basket_price=basket_price[basket_price['Basket Price']>0]
basket_price.sort_values('CustomerID')[:6]
C:\Users\glori\AppData\Local\Temp\ipykernel_22296\2682052397.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
| CustomerID | InvoiceNo | InvoiceDate | Basket Price | TotalQuantity | |
|---|---|---|---|---|---|
| 1 | 12347.0 | 537626 | 2010-12-07 14:57:00 | 711.79 | 319 |
| 2 | 12347.0 | 542237 | 2011-01-26 14:30:00 | 475.39 | 315 |
| 3 | 12347.0 | 549222 | 2011-04-07 10:43:00 | 636.25 | 483 |
| 4 | 12347.0 | 556201 | 2011-06-09 13:01:00 | 382.52 | 196 |
| 5 | 12347.0 | 562032 | 2011-08-02 08:48:00 | 584.91 | 277 |
| 6 | 12347.0 | 573511 | 2011-10-31 12:25:00 | 1294.32 | 676 |
basket_price.loc[basket_price['Basket Price'].idxmax()]  # largest single basket (notebook display only)
# Count baskets falling into each consecutive price band
price_range = [0, 50, 100, 200, 500, 1000, 5000, 50000]
count_price = []
# FIX: iterate the band edges pairwise instead of enumerate + skip-first
for low, high in zip(price_range[:-1], price_range[1:]):
    items_count = basket_price[(basket_price['Basket Price'] < high) &
                               (basket_price['Basket Price'] > low)]['Basket Price'].count()
    count_price.append(items_count)
#*******************************************************************
#Representation of the purchases amount
plt.rc('font', weight='bold')
f, ax = plt.subplots(figsize=(10,7))
labels = ['{}<.<{}'.format(low, high) for low, high in zip(price_range[:-1], price_range[1:])]
sizes = count_price
# FIX: the original explode was `0.0 if sizes[i] < 100 else 0.0` -- the same
# value on both branches; make "no explosion" explicit.
explode = [0.0] * len(sizes)
ax.pie(sizes, explode = explode, labels=labels,
       autopct = lambda x:'{:1.0f}%'.format(x) if x>1 else '',  # hide labels of tiny wedges
       shadow =False, startangle=0)
ax.axis('equal')
f.text(0.5, 1.01, "Representation of the purchases amount",
       ha = 'center', fontsize = 18);
#Compute how recent a single customer performed a transaction
# Recency = days between a customer's last purchase and the most recent
# purchase date present anywhere in the dataset.
df_recency = basket_price.groupby(by='CustomerID',
                                  as_index=False)['InvoiceDate'].max()
df_recency.columns = ['CustomerID', 'LastPurchaseDate']
recent_date = df_recency['LastPurchaseDate'].max()
df_recency['Recency'] = df_recency['LastPurchaseDate'].apply(
    lambda x: (recent_date - x).days)
df_recency
| CustomerID | LastPurchaseDate | Recency | |
|---|---|---|---|
| 0 | 12347.0 | 2011-12-07 15:52:00 | 1 |
| 1 | 12348.0 | 2011-09-25 13:13:00 | 74 |
| 2 | 12349.0 | 2011-11-21 09:51:00 | 18 |
| 3 | 12350.0 | 2011-02-02 16:01:00 | 309 |
| 4 | 12352.0 | 2011-11-03 14:37:00 | 35 |
| ... | ... | ... | ... |
| 4323 | 18280.0 | 2011-03-07 09:52:00 | 277 |
| 4324 | 18281.0 | 2011-06-12 10:53:00 | 180 |
| 4325 | 18282.0 | 2011-12-02 11:43:00 | 7 |
| 4326 | 18283.0 | 2011-12-06 12:02:00 | 3 |
| 4327 | 18287.0 | 2011-10-28 09:29:00 | 42 |
4328 rows × 3 columns
#Compute the number of times a customer has made a transaction
# Frequency = number of distinct baskets (invoice rows) per customer
frequency_df = (basket_price
                .groupby(by=['CustomerID'], as_index=False)['InvoiceDate']
                .count()
                .rename(columns={'InvoiceDate': 'Frequency'}))
frequency_df.head()
| CustomerID | Frequency | |
|---|---|---|
| 0 | 12347.0 | 7 |
| 1 | 12348.0 | 4 |
| 2 | 12349.0 | 1 |
| 3 | 12350.0 | 1 |
| 4 | 12352.0 | 7 |
#Compute the total amount spent by a customer within the given time.
# Monetary = sum of all of a customer's basket prices
monetary_df = (basket_price
               .groupby(by='CustomerID', as_index=False)['Basket Price']
               .sum()
               .rename(columns={'Basket Price': 'Monetary'}))
monetary_df.head()
| CustomerID | Monetary | |
|---|---|---|
| 0 | 12347.0 | 4310.00 |
| 1 | 12348.0 | 1797.24 |
| 2 | 12349.0 | 1757.55 |
| 3 | 12350.0 | 334.40 |
| 4 | 12352.0 | 2385.71 |
#Generate the RFM data set
# One row per customer with Recency, Frequency and Monetary features
rf_df = df_recency.merge(frequency_df, on='CustomerID')
rfm_df = rf_df.merge(monetary_df, on='CustomerID').drop(
    columns='LastPurchaseDate')
rfm_df.head()
| CustomerID | Recency | Frequency | Monetary | |
|---|---|---|---|---|
| 0 | 12347.0 | 1 | 7 | 4310.00 |
| 1 | 12348.0 | 74 | 4 | 1797.24 |
| 2 | 12349.0 | 18 | 1 | 1757.55 |
| 3 | 12350.0 | 309 | 1 | 334.40 |
| 4 | 12352.0 | 35 | 7 | 2385.71 |
# Boxplot of the raw (unscaled) RFM features to eyeball their ranges
plt.figure(figsize=(12,12))
plt.title("RFM variables distribution")
rfm_df.boxplot()
<AxesSubplot:title={'center':'RFM variables distribution'}>
#Data normalization
# Standardize the three RFM features to zero mean / unit variance so no
# single feature dominates the k-means distance computation.
print("Scaling data....")
feature_cols = ['Monetary', 'Frequency', 'Recency']
scaler = StandardScaler()
rfm_normalized = pd.DataFrame(scaler.fit_transform(rfm_df[feature_cols]),
                              columns=feature_cols)
print("Done!")
Scaling data.... Done!
rfm_normalized  # notebook display of the scaled features
| Monetary | Frequency | Recency | |
|---|---|---|---|
| 0 | 0.072829 | 0.107094 | -0.905229 |
| 1 | -0.019727 | -0.025766 | -0.176468 |
| 2 | -0.021189 | -0.158625 | -0.735517 |
| 3 | -0.073610 | -0.158625 | 2.169545 |
| 4 | 0.001949 | 0.107094 | -0.565806 |
| ... | ... | ... | ... |
| 4323 | -0.079276 | -0.158625 | 1.850088 |
| 4324 | -0.082951 | -0.158625 | 0.881734 |
| 4325 | -0.079423 | -0.114339 | -0.845331 |
| 4326 | -0.010582 | 0.505672 | -0.885263 |
| 4327 | -0.018253 | -0.070052 | -0.495925 |
4328 rows × 3 columns
#Check for outliers after scaling
plt.figure(figsize=(12,12))
plt.title("Outlier variable distribution")
rfm_normalized.boxplot()
<AxesSubplot:title={'center':'Outlier variable distribution'}>
#Outlier information
# One customer sits ~60 standard deviations out on Monetary and Frequency
rfm_normalized[rfm_normalized['Monetary']>60]
| Monetary | Frequency | Recency | |
|---|---|---|---|
| 2153 | 62.558306 | 61.886723 | -0.915212 |
#Remove outlier information
print()
# NOTE: only rfm_normalized is filtered; rfm_df keeps the outlier row, so the
# two frames differ by one row (index alignment handles this later).
indexID = rfm_normalized[rfm_normalized['Monetary'] > 60].index
rfm_normalized.drop(indexID, inplace=True)
print("Done!")
Done!
# Recheck the distribution of data after dropping the extreme outlier
plt.figure(figsize=(12,12))
plt.title("RFM variable distribution")
rfm_normalized.boxplot()
<AxesSubplot:title={'center':'RFM variable distribution'}>
#Compute the optimal number of clusters with the elbow method (inertia vs k)
inertia = []
num_clusters =range(2,8)
for i in num_clusters:
    kmeans=KMeans(n_clusters=i, max_iter=50)
    kmeans.fit(rfm_normalized)
    inertia.append(kmeans.inertia_)
plt.figure(figsize=(16,8))
plt.plot(num_clusters, inertia, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('inertia')
plt.title('The Elbow Method showing the optimal k')
plt.show()
print()
#Silhouette analysis: average silhouette score for each candidate k
print("Calculating silhouette scores for different values of k.....")
for k in num_clusters:
    #initialize kmeans
    kmeans = KMeans(n_clusters=k, max_iter=50)
    kmeans.fit(rfm_normalized)
    cluster_labels = kmeans.labels_
    #silhouette score (higher = better separated clusters)
    silhouette_avg = silhouette_score(rfm_normalized, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(k, silhouette_avg))
Calculating silhouette scores for different values of k..... For n_clusters=2, the silhouette score is 0.6873406120911523 For n_clusters=3, the silhouette score is 0.5845143260289865 For n_clusters=4, the silhouette score is 0.6030597736942671 For n_clusters=5, the silhouette score is 0.492318872510545 For n_clusters=6, the silhouette score is 0.5180847330599242 For n_clusters=7, the silhouette score is 0.5110443562977901
#Visualization of silhouette analysis
# For each candidate k: left panel shows per-sample silhouette values grouped
# by cluster with the average as a dashed line; right panel shows the data in
# the Monetary/Recency plane colored by cluster with centroids starred.
print("Visulization of silhouette scores against clustered data for different values of k")
# FIX: the outer loop's enumerate index was never used and was shadowed by
# the inner per-cluster loop variable `i`; iterate the k values directly.
for k in [2, 3, 4, 5, 6]:
    fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(15,5))
    # Run the Kmeans algorithm
    km=KMeans(n_clusters=k)
    labels = km.fit_predict(rfm_normalized)
    centroids = km.cluster_centers_
    # Get silhouette samples
    silhouette_vals = silhouette_samples(rfm_normalized, labels)
    # Silhouette plot: one horizontal band per cluster, sorted values
    y_ticks = []
    y_lower, y_upper = 0, 0
    for i, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
        y_lower += len(cluster_silhouette_vals)
    # Get the average silhouette score and plot it
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values')
    ax1.set_ylabel('Cluster labels')
    ax1.set_title('Silhouette plot for the various clusters', y=1.02);
    # Scatter plot of data colored with labels
    ax2.scatter(rfm_normalized['Monetary'], rfm_normalized['Recency'], c=labels)
    ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='r', s=250)
    ax2.set_xlabel('Monetary value')
    ax2.set_ylabel('Recency')
    ax2.set_title('Visualization of clustered data', y=1.02)
    ax2.set_aspect('equal')
    plt.tight_layout()
    plt.suptitle(f'Silhouette analysis using k = {k}',
                 fontsize=16, fontweight='semibold', y=1.05);
Visulization of silhouette scores against clustered data for different values of k
#Segmentation of customers
# Final model with k=4 -- presumably chosen from the elbow/silhouette
# analysis above; confirm the choice.
kmeans = KMeans(n_clusters=4, max_iter=50)
kmeans.fit(rfm_normalized)
KMeans(max_iter=50, n_clusters=4)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(max_iter=50, n_clusters=4)
# Attach CustomerID back; assignment aligns on index, so the outlier row
# dropped from rfm_normalized simply gets no value (rfm_df has one extra row).
rfm_normalized.loc[:, 'CustomerID'] = rfm_df['CustomerID']
rfm_normalized
| Monetary | Frequency | Recency | CustomerID | |
|---|---|---|---|---|
| 0 | 0.072829 | 0.107094 | -0.905229 | 12347.0 |
| 1 | -0.019727 | -0.025766 | -0.176468 | 12348.0 |
| 2 | -0.021189 | -0.158625 | -0.735517 | 12349.0 |
| 3 | -0.073610 | -0.158625 | 2.169545 | 12350.0 |
| 4 | 0.001949 | 0.107094 | -0.565806 | 12352.0 |
| ... | ... | ... | ... | ... |
| 4323 | -0.079276 | -0.158625 | 1.850088 | 18280.0 |
| 4324 | -0.082951 | -0.158625 | 0.881734 | 18281.0 |
| 4325 | -0.079423 | -0.114339 | -0.845331 | 18282.0 |
| 4326 | -0.010582 | 0.505672 | -0.885263 | 18283.0 |
| 4327 | -0.018253 | -0.070052 | -0.495925 | 18287.0 |
4327 rows × 4 columns
# Attach each customer's fitted cluster label
rfm_normalized['Cluster'] = kmeans.labels_
rfm_normalized
| Monetary | Frequency | Recency | CustomerID | Cluster | |
|---|---|---|---|---|---|
| 0 | 0.072829 | 0.107094 | -0.905229 | 12347.0 | 1 |
| 1 | -0.019727 | -0.025766 | -0.176468 | 12348.0 | 1 |
| 2 | -0.021189 | -0.158625 | -0.735517 | 12349.0 | 1 |
| 3 | -0.073610 | -0.158625 | 2.169545 | 12350.0 | 0 |
| 4 | 0.001949 | 0.107094 | -0.565806 | 12352.0 | 1 |
| ... | ... | ... | ... | ... | ... |
| 4323 | -0.079276 | -0.158625 | 1.850088 | 18280.0 | 0 |
| 4324 | -0.082951 | -0.158625 | 0.881734 | 18281.0 | 3 |
| 4325 | -0.079423 | -0.114339 | -0.845331 | 18282.0 | 1 |
| 4326 | -0.010582 | 0.505672 | -0.885263 | 18283.0 | 1 |
| 4327 | -0.018253 | -0.070052 | -0.495925 | 18287.0 | 1 |
4327 rows × 5 columns
rfm_normalized['Cluster'].value_counts()  # how many customers fell into each cluster
1 2868 3 810 0 626 2 23 Name: Cluster, dtype: int64
# Per-cluster distribution of the scaled Monetary feature
plt.figure(figsize=(12,12))
plt.title("Monetary variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Monetary', data=rfm_normalized)
<AxesSubplot:title={'center':'Monetary variable distribution within each cluster'}, xlabel='Cluster', ylabel='Monetary'>
# Per-cluster distribution of the scaled Frequency feature
plt.figure(figsize=(12,12))
plt.title("Frequency variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Frequency', data=rfm_normalized)
<AxesSubplot:title={'center':'Frequency variable distribution within each cluster'}, xlabel='Cluster', ylabel='Frequency'>
# Per-cluster distribution of the scaled Recency feature
plt.figure(figsize=(12,12))
plt.title("Recency variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Recency', data=rfm_normalized)
<AxesSubplot:title={'center':'Recency variable distribution within each cluster'}, xlabel='Cluster', ylabel='Recency'>
# Pairwise 2-D views of the clusters over the three scaled RFM features
plt.figure(figsize=(12,12))
plt.title("Clustering: Recency vs Monetary")
RM=sns.scatterplot(x='Recency', y='Monetary', hue='Cluster', palette="Set2", data=rfm_normalized)
plt.figure(figsize=(12,12))
plt.title("Clustering: Frequency vs Monetary")
FM=sns.scatterplot(x='Frequency', y='Monetary', hue='Cluster',palette="Set2", data=rfm_normalized)
plt.figure(figsize=(12,12))
plt.title("Clustering: Recency vs Frequency")
RF=sns.scatterplot(x='Recency', y='Frequency', hue='Cluster', palette="Set2", data=rfm_normalized)
plt.rcParams["figure.figsize"] = (25,25)
fig = plt.figure(1)
plt.clf()
ax = Axes3D(fig, rect = [0, 0, .95, 1],
elev = 48,
azim = 134)
plt.cla()
ax.scatter(rfm_normalized['Frequency'], rfm_normalized['Recency'], rfm_normalized['Monetary'],
c = rfm_normalized['Cluster'],
s = 200,
cmap = "spring",
alpha = 0.5,
edgecolor = 'darkgrey')
ax.set_xlabel('Frequency',
fontsize = 16)
ax.set_ylabel('Recency',
fontsize = 16)
ax.set_zlabel('Monetary',
fontsize = 16)
plt.show()
# import plotly.express as px
# fig2 = px.scatter_3d(rfm_normalized, x="Frequency", y="Recency", z="Monetary", color="Cluster",)
# fig2.update_layout(title="3 Features Representation")
# fig2.show()
C:\Users\glori\AppData\Local\Temp\ipykernel_22296\408342758.py:4: MatplotlibDeprecationWarning: Axes3D(fig) adding itself to the figure is deprecated since 3.4. Pass the keyword argument auto_add_to_figure=False and use fig.add_axes(ax) to suppress this warning. The default value of auto_add_to_figure will change to False in mpl3.5 and True values will no longer work in 3.6. This is consistent with other Axes classes.
#Selection and distribution of independent and dependent variables
# Features: the first three columns (Monetary, Frequency, Recency) as a
# numpy array -- note this relies on column position, not names.
X = rfm_normalized.iloc[:,:3].values
X[:10]
array([[ 7.28288079e-02, 1.07093859e-01, -9.05228987e-01],
[-1.97274636e-02, -2.57655586e-02, -1.76467627e-01],
[-2.11894251e-02, -1.58624976e-01, -7.35517437e-01],
[-7.36104514e-02, -1.58624976e-01, 2.16954497e+00],
[ 1.94853772e-03, 1.07093859e-01, -5.65805888e-01],
[-8.26496390e-02, -1.58624976e-01, 1.11134354e+00],
[-4.61687449e-02, -1.58624976e-01, 1.39086845e+00],
[-6.90061382e-02, -1.58624976e-01, 1.21117387e+00],
[ 1.76297233e-02, -7.00520310e-02, -6.95585308e-01],
[ 1.42728544e-01, -1.58624976e-01, -5.95754985e-01]])
# Target: column 4 ('Cluster'); the 4:5 slice keeps the result 2-D, as
# required by OneHotEncoder below.
Y = rfm_normalized.iloc[:,4:5].values
Y[:10]
array([[1],
[1],
[1],
[0],
[1],
[3],
[0],
[3],
[1],
[1]])
#One hot encode
# Convert each integer cluster label into a 4-wide one-hot vector
ohe = OneHotEncoder()
Y = ohe.fit_transform(Y).toarray()
print('One hot encoded array:')
print(Y[0:5])
One hot encoded array: [[0. 1. 0. 0.] [0. 1. 0. 0.] [0. 1. 0. 0.] [1. 0. 0. 0.] [0. 1. 0. 0.]]
#Train test split of model
# 90/10 split with a fixed seed for reproducibility
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.1,random_state = 0)
print("Done!")
Done!
#Defining the model
model = Sequential()
model.add(Dense(X_train.shape[1], input_dim=X_train.shape[1], activation='relu')) # input layer: one unit per feature (3 here; the summary below confirms)
model.add(Dense(4, activation='relu')) # hidden layer with 4 units
model.add(Dense(4, activation='softmax')) # output layer: softmax over the 4 cluster classes
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 3) 12
dense_1 (Dense) (None, 4) 16
dense_2 (Dense) (None, 4) 20
=================================================================
Total params: 48
Trainable params: 48
Non-trainable params: 0
_________________________________________________________________
model.layers[0].output  # symbolic output tensor of the first Dense layer (shape (None, 3))
<KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'dense')>
# Categorical cross-entropy matches the one-hot targets; a further 10% of the
# training split is held out for validation during fitting.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, Y_train, epochs=100, batch_size=64, validation_split = 0.1)
Epoch 1/100 55/55 [==============================] - 1s 4ms/step - loss: 1.3546 - accuracy: 0.8011 - val_loss: 1.2821 - val_accuracy: 0.7923 Epoch 2/100 55/55 [==============================] - 0s 1ms/step - loss: 1.2192 - accuracy: 0.7891 - val_loss: 1.1436 - val_accuracy: 0.7795 Epoch 3/100 55/55 [==============================] - 0s 1ms/step - loss: 1.0784 - accuracy: 0.8428 - val_loss: 0.9937 - val_accuracy: 0.8590 Epoch 4/100 55/55 [==============================] - 0s 1ms/step - loss: 0.9182 - accuracy: 0.8388 - val_loss: 0.8030 - val_accuracy: 0.8231 Epoch 5/100 55/55 [==============================] - 0s 1ms/step - loss: 0.7191 - accuracy: 0.8111 - val_loss: 0.6066 - val_accuracy: 0.8231 Epoch 6/100 55/55 [==============================] - 0s 1ms/step - loss: 0.5581 - accuracy: 0.8216 - val_loss: 0.4743 - val_accuracy: 0.8333 Epoch 7/100 55/55 [==============================] - 0s 1ms/step - loss: 0.4525 - accuracy: 0.8330 - val_loss: 0.3901 - val_accuracy: 0.8462 Epoch 8/100 55/55 [==============================] - 0s 2ms/step - loss: 0.3850 - accuracy: 0.8539 - val_loss: 0.3365 - val_accuracy: 0.8564 Epoch 9/100 55/55 [==============================] - 0s 1ms/step - loss: 0.3400 - accuracy: 0.8750 - val_loss: 0.2998 - val_accuracy: 0.8795 Epoch 10/100 55/55 [==============================] - 0s 1ms/step - loss: 0.3075 - accuracy: 0.8990 - val_loss: 0.2724 - val_accuracy: 0.8974 Epoch 11/100 55/55 [==============================] - 0s 1ms/step - loss: 0.2826 - accuracy: 0.9072 - val_loss: 0.2519 - val_accuracy: 0.9051 Epoch 12/100 55/55 [==============================] - 0s 1ms/step - loss: 0.2625 - accuracy: 0.9178 - val_loss: 0.2345 - val_accuracy: 0.9179 Epoch 13/100 55/55 [==============================] - 0s 1ms/step - loss: 0.2451 - accuracy: 0.9275 - val_loss: 0.2194 - val_accuracy: 0.9231 Epoch 14/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2298 - accuracy: 0.9418 - val_loss: 0.2060 - val_accuracy: 0.9308 Epoch 15/100 55/55 
[==============================] - 0s 2ms/step - loss: 0.2156 - accuracy: 0.9449 - val_loss: 0.1943 - val_accuracy: 0.9385 Epoch 16/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2029 - accuracy: 0.9526 - val_loss: 0.1830 - val_accuracy: 0.9436 Epoch 17/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1911 - accuracy: 0.9592 - val_loss: 0.1730 - val_accuracy: 0.9410 Epoch 18/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1805 - accuracy: 0.9603 - val_loss: 0.1638 - val_accuracy: 0.9436 Epoch 19/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1700 - accuracy: 0.9626 - val_loss: 0.1551 - val_accuracy: 0.9590 Epoch 20/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1608 - accuracy: 0.9663 - val_loss: 0.1473 - val_accuracy: 0.9615 Epoch 21/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1523 - accuracy: 0.9735 - val_loss: 0.1404 - val_accuracy: 0.9564 Epoch 22/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1442 - accuracy: 0.9706 - val_loss: 0.1333 - val_accuracy: 0.9769 Epoch 23/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1369 - accuracy: 0.9792 - val_loss: 0.1273 - val_accuracy: 0.9795 Epoch 24/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1300 - accuracy: 0.9752 - val_loss: 0.1216 - val_accuracy: 0.9795 Epoch 25/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1237 - accuracy: 0.9812 - val_loss: 0.1173 - val_accuracy: 0.9641 Epoch 26/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1184 - accuracy: 0.9792 - val_loss: 0.1117 - val_accuracy: 0.9821 Epoch 27/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1134 - accuracy: 0.9817 - val_loss: 0.1074 - val_accuracy: 0.9923 Epoch 28/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1090 - accuracy: 0.9837 - val_loss: 0.1033 - val_accuracy: 0.9923 Epoch 29/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.1048 - accuracy: 0.9846 - val_loss: 0.0997 - val_accuracy: 0.9923 Epoch 30/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1008 - accuracy: 0.9872 - val_loss: 0.0962 - val_accuracy: 0.9923 Epoch 31/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0972 - accuracy: 0.9834 - val_loss: 0.0931 - val_accuracy: 0.9923 Epoch 32/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0944 - accuracy: 0.9883 - val_loss: 0.0901 - val_accuracy: 0.9923 Epoch 33/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0912 - accuracy: 0.9883 - val_loss: 0.0873 - val_accuracy: 0.9949 Epoch 34/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0885 - accuracy: 0.9877 - val_loss: 0.0850 - val_accuracy: 0.9949 Epoch 35/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0860 - accuracy: 0.9886 - val_loss: 0.0823 - val_accuracy: 0.9949 Epoch 36/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0835 - accuracy: 0.9894 - val_loss: 0.0807 - val_accuracy: 0.9923 Epoch 37/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0814 - accuracy: 0.9886 - val_loss: 0.0780 - val_accuracy: 0.9974 Epoch 38/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0792 - accuracy: 0.9894 - val_loss: 0.0762 - val_accuracy: 0.9949 Epoch 39/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0774 - accuracy: 0.9903 - val_loss: 0.0743 - val_accuracy: 0.9949 Epoch 40/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0755 - accuracy: 0.9886 - val_loss: 0.0728 - val_accuracy: 0.9974 Epoch 41/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0737 - accuracy: 0.9892 - val_loss: 0.0710 - val_accuracy: 0.9974 Epoch 42/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0722 - accuracy: 0.9894 - val_loss: 0.0694 - val_accuracy: 0.9974 Epoch 43/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.0705 - accuracy: 0.9903 - val_loss: 0.0684 - val_accuracy: 0.9974 Epoch 44/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0688 - accuracy: 0.9886 - val_loss: 0.0664 - val_accuracy: 0.9949 Epoch 45/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0676 - accuracy: 0.9892 - val_loss: 0.0654 - val_accuracy: 0.9974 Epoch 46/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0661 - accuracy: 0.9892 - val_loss: 0.0638 - val_accuracy: 0.9949 Epoch 47/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0651 - accuracy: 0.9900 - val_loss: 0.0629 - val_accuracy: 0.9974 Epoch 48/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0638 - accuracy: 0.9906 - val_loss: 0.0616 - val_accuracy: 0.9949 Epoch 49/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0625 - accuracy: 0.9903 - val_loss: 0.0606 - val_accuracy: 0.9974 Epoch 50/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0612 - accuracy: 0.9917 - val_loss: 0.0600 - val_accuracy: 0.9974 Epoch 51/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0602 - accuracy: 0.9897 - val_loss: 0.0586 - val_accuracy: 0.9974 Epoch 52/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0591 - accuracy: 0.9897 - val_loss: 0.0577 - val_accuracy: 0.9949 Epoch 53/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0579 - accuracy: 0.9906 - val_loss: 0.0570 - val_accuracy: 0.9974 Epoch 54/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0568 - accuracy: 0.9906 - val_loss: 0.0556 - val_accuracy: 0.9974 Epoch 55/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0559 - accuracy: 0.9912 - val_loss: 0.0549 - val_accuracy: 0.9974 Epoch 56/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0548 - accuracy: 0.9906 - val_loss: 0.0537 - val_accuracy: 0.9949 Epoch 57/100 55/55 
[==============================] - 0s 2ms/step - loss: 0.0540 - accuracy: 0.9906 - val_loss: 0.0532 - val_accuracy: 0.9974 Epoch 58/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0530 - accuracy: 0.9906 - val_loss: 0.0520 - val_accuracy: 0.9949 Epoch 59/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0523 - accuracy: 0.9906 - val_loss: 0.0516 - val_accuracy: 0.9974 Epoch 60/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0514 - accuracy: 0.9909 - val_loss: 0.0511 - val_accuracy: 0.9974 Epoch 61/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0506 - accuracy: 0.9903 - val_loss: 0.0502 - val_accuracy: 0.9974 Epoch 62/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0499 - accuracy: 0.9906 - val_loss: 0.0502 - val_accuracy: 0.9974 Epoch 63/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0489 - accuracy: 0.9897 - val_loss: 0.0493 - val_accuracy: 0.9974 Epoch 64/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0485 - accuracy: 0.9909 - val_loss: 0.0478 - val_accuracy: 0.9974 Epoch 65/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0474 - accuracy: 0.9917 - val_loss: 0.0474 - val_accuracy: 0.9974 Epoch 66/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0468 - accuracy: 0.9906 - val_loss: 0.0466 - val_accuracy: 0.9974 Epoch 67/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0461 - accuracy: 0.9903 - val_loss: 0.0465 - val_accuracy: 0.9974 Epoch 68/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0452 - accuracy: 0.9903 - val_loss: 0.0451 - val_accuracy: 0.9949 Epoch 69/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0448 - accuracy: 0.9909 - val_loss: 0.0449 - val_accuracy: 0.9974 Epoch 70/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0439 - accuracy: 0.9909 - val_loss: 0.0444 - val_accuracy: 0.9974 Epoch 71/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.0434 - accuracy: 0.9912 - val_loss: 0.0434 - val_accuracy: 0.9949 Epoch 72/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0427 - accuracy: 0.9929 - val_loss: 0.0428 - val_accuracy: 0.9949 Epoch 73/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0423 - accuracy: 0.9897 - val_loss: 0.0428 - val_accuracy: 0.9974 Epoch 74/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0416 - accuracy: 0.9903 - val_loss: 0.0420 - val_accuracy: 0.9974 Epoch 75/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0411 - accuracy: 0.9912 - val_loss: 0.0412 - val_accuracy: 0.9949 Epoch 76/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0404 - accuracy: 0.9912 - val_loss: 0.0409 - val_accuracy: 0.9923 Epoch 77/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0400 - accuracy: 0.9912 - val_loss: 0.0403 - val_accuracy: 0.9949 Epoch 78/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0394 - accuracy: 0.9914 - val_loss: 0.0401 - val_accuracy: 0.9923 Epoch 79/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0388 - accuracy: 0.9917 - val_loss: 0.0393 - val_accuracy: 0.9949 Epoch 80/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0382 - accuracy: 0.9906 - val_loss: 0.0391 - val_accuracy: 0.9974 Epoch 81/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0376 - accuracy: 0.9917 - val_loss: 0.0387 - val_accuracy: 0.9974 Epoch 82/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0373 - accuracy: 0.9920 - val_loss: 0.0393 - val_accuracy: 0.9974 Epoch 83/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0367 - accuracy: 0.9914 - val_loss: 0.0381 - val_accuracy: 0.9974 Epoch 84/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0363 - accuracy: 0.9917 - val_loss: 0.0370 - val_accuracy: 0.9949 Epoch 85/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.0357 - accuracy: 0.9923 - val_loss: 0.0365 - val_accuracy: 0.9949 Epoch 86/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0353 - accuracy: 0.9906 - val_loss: 0.0362 - val_accuracy: 0.9974 Epoch 87/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0350 - accuracy: 0.9909 - val_loss: 0.0356 - val_accuracy: 0.9949 Epoch 88/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0349 - accuracy: 0.9906 - val_loss: 0.0353 - val_accuracy: 0.9974 Epoch 89/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0340 - accuracy: 0.9912 - val_loss: 0.0359 - val_accuracy: 0.9974 Epoch 90/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0337 - accuracy: 0.9914 - val_loss: 0.0349 - val_accuracy: 0.9974 Epoch 91/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0333 - accuracy: 0.9909 - val_loss: 0.0349 - val_accuracy: 0.9974 Epoch 92/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0328 - accuracy: 0.9917 - val_loss: 0.0336 - val_accuracy: 0.9949 Epoch 93/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0324 - accuracy: 0.9912 - val_loss: 0.0337 - val_accuracy: 0.9974 Epoch 94/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0319 - accuracy: 0.9923 - val_loss: 0.0330 - val_accuracy: 0.9949 Epoch 95/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0318 - accuracy: 0.9909 - val_loss: 0.0327 - val_accuracy: 0.9949 Epoch 96/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0313 - accuracy: 0.9897 - val_loss: 0.0333 - val_accuracy: 0.9974 Epoch 97/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0313 - accuracy: 0.9903 - val_loss: 0.0336 - val_accuracy: 0.9974 Epoch 98/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0307 - accuracy: 0.9906 - val_loss: 0.0322 - val_accuracy: 0.9974 Epoch 99/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.0303 - accuracy: 0.9906 - val_loss: 0.0324 - val_accuracy: 0.9974 Epoch 100/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0300 - accuracy: 0.9917 - val_loss: 0.0318 - val_accuracy: 0.9974
print("Predicting.....")
# The model emits one softmax probability vector per test sample.
y_pred = model.predict(X_test)
# Collapse each row to the index of its highest probability to recover the
# class label. np.argmax over axis 1 replaces the manual per-index loop;
# cast to int so the list holds plain Python ints, not numpy scalars.
pred = [int(label) for label in np.argmax(y_pred, axis=1)]
print("Done!")
Predicting..... 14/14 [==============================] - 0s 850us/step Done!
# Convert the one-hot encoded test labels back to integer class labels.
# Assumes Y_test is a 2-D array of one-hot rows (matches the original
# per-row np.argmax loop) — TODO confirm against the encoding step.
test = [int(label) for label in np.argmax(Y_test, axis=1)]
print("Done!")
Done!
print()
# Fixed typo in the user-facing label: "Pridicted" -> "Predicted".
print("Predicted values")
pred[:10]  # bare expression: displays only in a notebook cell
Predicted values
[3, 1, 0, 3, 1, 1, 3, 1, 1, 1]
# Display the first ten ground-truth labels for side-by-side comparison
# with the predictions shown above.
print("\nActual values")
test[:10]  # bare expression: displays only in a notebook cell
Actual values
[3, 1, 0, 3, 1, 1, 3, 1, 1, 1]
# sklearn's accuracy_score signature is (y_true, y_pred); pass the ground
# truth first. Accuracy is symmetric, so the computed value is unchanged,
# but the corrected order matches the documented API contract.
a = accuracy_score(test, pred)
print('Accuracy of model on test data is:', a * 100)
Accuracy of model on test data is: 99.76905311778292
# Learning curves: accuracy per epoch for the training data and for the
# 10% validation split (model.fit used validation_split, not a test set).
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# Fixed legend: the second curve is the validation split, not test data.
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
# Learning curves: loss per epoch for the training data and for the
# 10% validation split.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# Fixed legend: the second curve is the validation split, not test data.
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
import pickle

print("Saving model....")
# Use a context manager so the file handle is closed deterministically;
# the original passed an open() result straight to pickle.dump, leaving
# the handle open until garbage collection.
# NOTE(review): pickling a Keras model is fragile across TF versions, and
# the file header imports keras.models.load_model, which cannot read a
# pickle file — consider model.save('Behavioural_Segmentation_model.h5')
# instead. TODO confirm how this artifact is loaded downstream.
with open('Behavioural_Segmentation_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
print("Model saved")
Saving model.... INFO:tensorflow:Assets written to: ram://cb2ad320-36ba-4252-a332-5f6337684eb0/assets Model saved
# Function to predict customer group from user input
def run_model(Recency_pred, Frequency_pred, Monetary_pred):
    """Predict the customer segment for one user-entered RFM observation.

    Parameters
    ----------
    Recency_pred, Frequency_pred, Monetary_pred : float
        The three RFM feature values taken from the interactive widgets.

    Returns
    -------
    ipywidgets.Output
        Output widget whose captured text is the predicted cluster label.

    The DataFrame column order (Monetary, Frequency, Recency) must match the
    column order the scaler and model were fitted on — TODO confirm against
    the training code earlier in this notebook.
    """
    pred_data = {'Monetary': [Monetary_pred],
                 'Frequency': [Frequency_pred],
                 'Recency': [Recency_pred]}
    pred_data = pd.DataFrame(pred_data)
    print()
    print("UNSCALED DATA SET")
    print(pred_data)
    # Apply the StandardScaler fitted on the training data.
    pred_data = pd.DataFrame(scaler.transform(pred_data), columns=pred_data.columns)
    print()
    print("SCALED DATA SET")
    print(pred_data)
    print()
    print("PREDICTING......")
    X1 = np.array(pred_data)
    print(X1)
    result = model.predict(X1)
    # Collapse each softmax row to its most probable class index
    # (comprehension replaces the original manual index loop).
    k = [np.argmax(row) for row in result]
    out = Output(layout={'border': '1px solid white'})
    with out:
        print(k)
    return out
# Interactive front-end: one bounded numeric input per RFM feature, wired
# to run_model so the prediction refreshes whenever a value changes.
recency_widget = BoundedFloatText(value=0, min=0, max=373, step=1,
                                  description='Recency')
frequency_widget = BoundedFloatText(value=0, min=1, max=1402, step=1,
                                    description='Frequency')
monetary_widget = BoundedFloatText(value=2.9, min=2.9, max=279765.02, step=1,
                                   description='Monetary')
interact(run_model,
         Recency_pred=recency_widget,
         Frequency_pred=frequency_widget,
         Monetary_pred=monetary_widget)
interactive(children=(BoundedFloatText(value=0.0, description='Recency', max=373.0, step=1.0), BoundedFloatTex…
<function __main__.run_model(Recency_pred, Frequency_pred, Monetary_pred)>